#load needed packages. make sure they are installed.
pacman::p_load(here, knitr, tidyverse, skimr, fpp2, tigris, plotly, gganimate, viridis, transformr)
theme_set(theme_minimal())An example exploratory analysis script
Setup
Load the data.
d1 <- readRDS(here('data','processed-data','processed-crime.rds'))
zips <- read_csv(here('data','raw-data','austin-zip-codes.csv'))Rows: 54 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): the_geom, AFFGEOID10
dbl (4): Zip Code, GEOID10, ALAND10, AWATER10
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data exploration through tables
Showing a bit of code to produce and save a summary table.
summary(d1) Incident.Number Highest.Offense.Description Highest.Offense.Code
Min. :2.004e+04 Length:2461621 Min. : 100
1st Qu.:2.005e+10 Class :character 1st Qu.: 601
Median :2.011e+10 Mode :character Median :1199
Mean :6.032e+10 Mean :1689
3rd Qu.:2.017e+10 3rd Qu.:2716
Max. :2.024e+12 Max. :8905
Family.Violence Occurred.Date.Time Occurred.Date
Length:2461621 Min. :2003-01-01 00:00:00.00 Min. :2003-01-01
Class :character 1st Qu.:2007-11-16 16:49:00.00 1st Qu.:2007-11-16
Mode :character Median :2012-05-28 23:09:00.00 Median :2012-05-28
Mean :2012-11-25 19:50:28.18 Mean :2012-11-25
3rd Qu.:2017-10-26 21:19:00.00 3rd Qu.:2017-10-26
Max. :2024-06-01 23:46:00.00 Max. :2024-06-01
Occurred.Time Report.Date.Time Report.Date
Length:2461621 Min. :2002-11-29 05:30:00.00 Min. :2002-11-29
Class1:hms 1st Qu.:2007-11-27 22:41:00.00 1st Qu.:2007-11-27
Class2:difftime Median :2012-06-06 11:15:00.00 Median :2012-06-06
Mode :numeric Mean :2012-12-04 16:45:39.02 Mean :2012-12-04
3rd Qu.:2017-11-05 01:56:00.00 3rd Qu.:2017-11-05
Max. :2024-06-02 01:20:00.00 Max. :2024-06-02
Report.Time Location.Type Address Zip.Code
Length:2461621 Length:2461621 Length:2461621 Min. :76574
Class1:hms Class :character Class :character 1st Qu.:78717
Class2:difftime Mode :character Mode :character Median :78741
Mode :numeric Mean :78732
3rd Qu.:78752
Max. :78759
Council.District APD.Sector APD.District PRA
Min. : 1.000 Length:2461621 Length:2461621 Length:2461621
1st Qu.: 3.000 Class :character Class :character Class :character
Median : 4.000 Mode :character Mode :character Mode :character
Mean : 4.965
3rd Qu.: 7.000
Max. :10.000
NA's :30699
Census.Tract Clearance.Status Clearance.Date UCR.Category
Min. : 1.0 Length:2461621 Min. :2003-01-01 Length:2461621
1st Qu.: 15.0 Class :character 1st Qu.:2008-04-07 Class :character
Median : 23.2 Mode :character Median :2012-10-17 Mode :character
Mean : 245.4 Mean :2013-03-14
3rd Qu.: 338.0 3rd Qu.:2018-01-19
Max. :950800.0 Max. :2024-06-02
NA's :8822 NA's :348308
Category.Description X.coordinate Y.coordinate Latitude
Length:2461621 Min. : 0 Min. : 0 Min. :30.01
Class :character 1st Qu.:3108421 1st Qu.:10057433 1st Qu.:30.23
Mode :character Median :3117292 Median :10073004 Median :30.28
Mean :3075787 Mean : 9946761 Mean :30.29
3rd Qu.:3126595 3rd Qu.:10100561 3rd Qu.:30.35
Max. :3231806 Max. :10215496 Max. :30.67
NA's :32335
Longitude Location
Min. :-98.18 Length:2461621
1st Qu.:-97.76 Class :character
Median :-97.73 Mode :character
Mean :-97.73
3rd Qu.:-97.70
Max. :-97.37
NA's :32335
head(d1) Incident.Number Highest.Offense.Description Highest.Offense.Code
1 2013851154 SEXUAL ASSAULT OF CHILD/OBJECT 1707
2 20161800084 RAPE OF A CHILD 204
3 2010701921 RAPE 200
4 20071820003 RAPE 200
5 20062192048 SEXUAL ASSAULT W/ OBJECT 1700
6 20033211543 RAPE 200
Family.Violence Occurred.Date.Time Occurred.Date Occurred.Time
1 Y 2009-01-01 00:01:00 2009-01-01 00:01:00
2 Y 2016-06-28 01:05:00 2016-06-28 01:05:00
3 Y 2010-03-04 19:15:00 2010-03-04 19:15:00
4 N 2007-07-01 12:00:00 2007-07-01 12:00:00
5 N 2006-08-07 22:28:00 2006-08-07 22:28:00
6 Y 2003-11-17 14:00:00 2003-11-17 14:00:00
Report.Date.Time Report.Date Report.Time Location.Type
1 2013-03-26 16:56:00 2013-03-26 16:56:00 RESIDENCE / HOME
2 2016-06-28 01:05:00 2016-06-28 01:05:00 RESIDENCE / HOME
3 2010-03-11 17:06:00 2010-03-11 17:06:00 RESIDENCE / HOME
4 2007-07-01 12:00:00 2007-07-01 12:00:00 RESIDENCE / HOME
5 2006-08-07 22:28:00 2006-08-07 22:28:00 RESIDENCE / HOME
6 2003-11-17 21:40:00 2003-11-17 21:40:00 RESIDENCE / HOME
Address Zip.Code Council.District APD.Sector APD.District
1 900 BLOCK E 32ND ST 78705 9 BA 1
2 6900 BLOCK BRANCHWOOD DR 78744 2 FR 8
3 400 BLOCK ANGEL OAK ST 78748 5 FR 2
4 1700 BLOCK WOOTEN DR 78757 7 ID 7
5 500 BLOCK E OLTORF ST 78704 9 DA 2
6 7300 BLOCK DANJEAN DR 78745 NA DA 6
PRA Census.Tract Clearance.Status Clearance.Date UCR.Category
1 348 4.00 C 2013-04-11 11C
2 530 24.41 C 2016-07-01 11A
3 542 24.38 C 2010-03-18 11A
4 247 405.00 O 2007-08-02 11A
5 479 23.23 2006-08-22 11C
6 525 1728.00 O 2003-11-30 11A
Category.Description X.coordinate Y.coordinate Latitude Longitude Location
1 Rape 0 0 NA NA
2 Rape 0 0 NA NA
3 Rape 0 0 NA NA
4 Rape 0 0 NA NA
5 Rape 0 0 NA NA
6 Rape 0 0 NA NA
str(d1)'data.frame': 2461621 obs. of 27 variables:
$ Incident.Number : num 2.01e+09 2.02e+10 2.01e+09 2.01e+10 2.01e+10 ...
$ Highest.Offense.Description: chr "SEXUAL ASSAULT OF CHILD/OBJECT" "RAPE OF A CHILD" "RAPE" "RAPE" ...
$ Highest.Offense.Code : int 1707 204 200 200 1700 200 902 2703 200 2006 ...
$ Family.Violence : chr "Y" "Y" "Y" "N" ...
$ Occurred.Date.Time : POSIXct, format: "2009-01-01 00:01:00" "2016-06-28 01:05:00" ...
$ Occurred.Date : Date, format: "2009-01-01" "2016-06-28" ...
$ Occurred.Time : 'hms' num 00:01:00 01:05:00 19:15:00 12:00:00 ...
..- attr(*, "units")= chr "secs"
$ Report.Date.Time : POSIXct, format: "2013-03-26 16:56:00" "2016-06-28 01:05:00" ...
$ Report.Date : Date, format: "2013-03-26" "2016-06-28" ...
$ Report.Time : 'hms' num 16:56:00 01:05:00 17:06:00 12:00:00 ...
..- attr(*, "units")= chr "secs"
$ Location.Type : chr "RESIDENCE / HOME" "RESIDENCE / HOME" "RESIDENCE / HOME" "RESIDENCE / HOME" ...
$ Address : chr "900 BLOCK E 32ND ST" "6900 BLOCK BRANCHWOOD DR" "400 BLOCK ANGEL OAK ST" "1700 BLOCK WOOTEN DR" ...
$ Zip.Code : int 78705 78744 78748 78757 78704 78745 78702 78759 78705 78741 ...
$ Council.District : int 9 2 5 7 9 NA 3 10 9 3 ...
$ APD.Sector : chr "BA" "FR" "FR" "ID" ...
$ APD.District : chr "1" "8" "2" "7" ...
$ PRA : chr "348" "530" "542" "247" ...
$ Census.Tract : num 4 24.4 24.4 405 23.2 ...
$ Clearance.Status : chr "C" "C" "C" "O" ...
$ Clearance.Date : Date, format: "2013-04-11" "2016-07-01" ...
$ UCR.Category : chr "11C" "11A" "11A" "11A" ...
$ Category.Description : chr "Rape" "Rape" "Rape" "Rape" ...
$ X.coordinate : int 0 0 0 0 0 0 0 0 0 0 ...
$ Y.coordinate : int 0 0 0 0 0 0 0 0 0 0 ...
$ Latitude : num NA NA NA NA NA NA NA NA NA NA ...
$ Longitude : num NA NA NA NA NA NA NA NA NA NA ...
$ Location : chr "" "" "" "" ...
skim(d1)| Name | d1 |
| Number of rows | 2461621 |
| Number of columns | 27 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| Date | 3 |
| difftime | 2 |
| numeric | 9 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Highest.Offense.Description | 0 | 1 | 3 | 48 | 0 | 436 | 0 |
| Family.Violence | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| Location.Type | 0 | 1 | 7 | 47 | 0 | 47 | 0 |
| Address | 0 | 1 | 8 | 74 | 0 | 246951 | 0 |
| APD.Sector | 0 | 1 | 2 | 5 | 0 | 14 | 0 |
| APD.District | 0 | 1 | 1 | 2 | 0 | 21 | 0 |
| PRA | 0 | 1 | 1 | 4 | 0 | 742 | 0 |
| Clearance.Status | 0 | 1 | 0 | 1 | 615856 | 4 | 0 |
| UCR.Category | 0 | 1 | 0 | 3 | 1550375 | 17 | 0 |
| Category.Description | 0 | 1 | 0 | 18 | 1550375 | 8 | 0 |
| Location | 0 | 1 | 0 | 27 | 32335 | 219842 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Occurred.Date | 0 | 1.00 | 2003-01-01 | 2024-06-01 | 2012-05-28 | 7823 |
| Report.Date | 0 | 1.00 | 2002-11-29 | 2024-06-02 | 2012-06-06 | 7825 |
| Clearance.Date | 348308 | 0.86 | 2003-01-01 | 2024-06-02 | 2012-10-17 | 7814 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Occurred.Time | 0 | 1 | 0 secs | 86340 secs | 14:25:00 | 1440 |
| Report.Time | 0 | 1 | 0 secs | 86340 secs | 14:06:00 | 1440 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Incident.Number | 0 | 1.00 | 6.031558e+10 | 2.896224e+11 | 20035.00 | 2.005329e+10 | 2.010505e+10 | 2.017186e+10 | 2.024242e+12 | ▇▁▁▁▁ |
| Highest.Offense.Code | 0 | 1.00 | 1.689080e+03 | 1.218280e+03 | 100.00 | 6.010000e+02 | 1.199000e+03 | 2.716000e+03 | 8.905000e+03 | ▇▅▁▁▁ |
| Zip.Code | 0 | 1.00 | 7.873243e+04 | 2.510000e+01 | 76574.00 | 7.871700e+04 | 7.874100e+04 | 7.875200e+04 | 7.875900e+04 | ▁▁▁▁▇ |
| Council.District | 30699 | 0.99 | 4.960000e+00 | 2.840000e+00 | 1.00 | 3.000000e+00 | 4.000000e+00 | 7.000000e+00 | 1.000000e+01 | ▅▇▃▃▅ |
| Census.Tract | 8822 | 1.00 | 2.453700e+02 | 3.363970e+03 | 1.00 | 1.500000e+01 | 2.324000e+01 | 3.380000e+02 | 9.508000e+05 | ▇▁▁▁▁ |
| X.coordinate | 0 | 1.00 | 3.075787e+06 | 3.551571e+05 | 0.00 | 3.108421e+06 | 3.117292e+06 | 3.126595e+06 | 3.231806e+06 | ▁▁▁▁▇ |
| Y.coordinate | 0 | 1.00 | 9.946761e+06 | 1.147895e+06 | 0.00 | 1.005743e+07 | 1.007300e+07 | 1.010056e+07 | 1.021550e+07 | ▁▁▁▁▇ |
| Latitude | 32335 | 0.99 | 3.029000e+01 | 8.000000e-02 | 30.01 | 3.023000e+01 | 3.028000e+01 | 3.035000e+01 | 3.067000e+01 | ▁▇▇▂▁ |
| Longitude | 32335 | 0.99 | -9.773000e+01 | 5.000000e-02 | -98.18 | -9.776000e+01 | -9.773000e+01 | -9.770000e+01 | -9.737000e+01 | ▁▁▇▂▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Occurred.Date.Time | 0 | 1 | 2003-01-01 00:00:00 | 2024-06-01 23:46:00 | 2012-05-28 23:09:00 | 1738386 |
| Report.Date.Time | 0 | 1 | 2002-11-29 05:30:00 | 2024-06-02 01:20:00 | 2012-06-06 11:15:00 | 2169726 |
summary(zips) the_geom Zip Code AFFGEOID10 GEOID10
Length:54 Min. :78610 Length:54 Min. :78610
Class :character 1st Qu.:78704 Class :character 1st Qu.:78704
Mode :character Median :78730 Mode :character Median :78730
Mean :78716 Mean :78716
3rd Qu.:78745 3rd Qu.:78745
Max. :78759 Max. :78759
ALAND10 AWATER10
Min. : 824603 Min. : 0
1st Qu.: 19996385 1st Qu.: 0
Median : 33872800 Median : 45636
Mean : 54514717 Mean : 815147
3rd Qu.: 54926871 3rd Qu.: 541028
Max. :327324441 Max. :10884130
head(zips)# A tibble: 6 × 6
the_geom `Zip Code` AFFGEOID10 GEOID10 ALAND10 AWATER10
<chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 MULTIPOLYGON (((-97.5275109999… 78612 8600000US… 78612 2.43e8 1463994
2 MULTIPOLYGON (((-97.8449419999… 78745 8600000US… 78745 3.46e7 0
3 MULTIPOLYGON (((-97.7376429999… 78751 8600000US… 78751 6.21e6 0
4 MULTIPOLYGON (((-97.737174 30.… 78702 8600000US… 78702 1.29e7 491456
5 MULTIPOLYGON (((-97.749349 30.… 78741 8600000US… 78741 1.97e7 557552
6 MULTIPOLYGON (((-97.638821 30.… 78653 8600000US… 78653 2.71e8 1296973
str(zips)spc_tbl_ [54 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ the_geom : chr [1:54] "MULTIPOLYGON (((-97.52751099999999 30.092827999999997, -97.527214 30.094862, -97.52465099999999 30.097151999999"| __truncated__ "MULTIPOLYGON (((-97.84494199999999 30.200414, -97.839648 30.199551999999994, -97.833557 30.209342999999993, -97"| __truncated__ "MULTIPOLYGON (((-97.73764299999999 30.303493, -97.73297600000001 30.310843, -97.737571 30.313464, -97.735227 30"| __truncated__ "MULTIPOLYGON (((-97.737174 30.259136, -97.732996 30.270465, -97.730048 30.278506, -97.728487 30.278568, -97.711"| __truncated__ ...
$ Zip Code : num [1:54] 78612 78745 78751 78702 78741 ...
$ AFFGEOID10: chr [1:54] "8600000US78612" "8600000US78745" "8600000US78751" "8600000US78702" ...
$ GEOID10 : num [1:54] 78612 78745 78751 78702 78741 ...
$ ALAND10 : num [1:54] 2.43e+08 3.46e+07 6.21e+06 1.29e+07 1.97e+07 ...
$ AWATER10 : num [1:54] 1463994 0 0 491456 557552 ...
- attr(*, "spec")=
.. cols(
.. the_geom = col_character(),
.. `Zip Code` = col_double(),
.. AFFGEOID10 = col_character(),
.. GEOID10 = col_double(),
.. ALAND10 = col_double(),
.. AWATER10 = col_double()
.. )
- attr(*, "problems")=<externalptr>
skim(zips)| Name | zips |
| Number of rows | 54 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| the_geom | 0 | 1 | 811 | 10887 | 0 | 54 | 0 |
| AFFGEOID10 | 0 | 1 | 14 | 14 | 0 | 54 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Zip Code | 0 | 1 | 78716.44 | 40.9 | 78610 | 78704.25 | 78729.5 | 78744.75 | 78759 | ▁▂▁▅▇ |
| GEOID10 | 0 | 1 | 78716.44 | 40.9 | 78610 | 78704.25 | 78729.5 | 78744.75 | 78759 | ▁▂▁▅▇ |
| ALAND10 | 0 | 1 | 54514716.76 | 69396455.1 | 824603 | 19996384.75 | 33872799.5 | 54926871.00 | 327324441 | ▇▁▁▁▁ |
| AWATER10 | 0 | 1 | 815146.96 | 2006503.2 | 0 | 0.00 | 45635.5 | 541028.00 | 10884130 | ▇▁▁▁▁ |
Important to note that the dataset has several date fields, and that alone will be ripe with opportunity for exploration. However I want to take a look at some of the other variables first, to see if maybe isolating to one of the other categories, or including other categorical variables in the time series could be useful. First off is Zip Code, because location is often a major factor in determining the amount and type of crime that would occur.
d1 %>%
mutate(years = year(Occurred.Date),
years = as.character(years)
) %>%
group_by(years, Zip.Code) %>%
summarize(
count = n()
) %>% as.data.frame() %>% pivot_wider(names_from = years, values_from = count)`summarise()` has grouped output by 'years'. You can override using the
`.groups` argument.
# A tibble: 69 × 23
Zip.Code `2003` `2004` `2005` `2006` `2007` `2008` `2009` `2010` `2011`
<int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 76574 1 4 1 NA 2 6 NA 1 NA
2 78610 1 17 14 10 5 8 9 6 5
3 78612 1 NA 1 3 NA 2 NA NA NA
4 78613 379 435 301 343 338 470 461 442 431
5 78617 748 756 722 744 840 1035 1038 1084 1063
6 78620 2 1 5 2 3 1 1 NA NA
7 78634 2 NA NA 1 1 1 NA NA NA
8 78641 11 3 6 9 5 11 1 3 NA
9 78642 1 NA NA NA NA 1 NA NA NA
10 78645 4 1 5 3 4 4 NA 1 NA
# ℹ 59 more rows
# ℹ 13 more variables: `2012` <int>, `2013` <int>, `2014` <int>, `2015` <int>,
# `2016` <int>, `2017` <int>, `2018` <int>, `2019` <int>, `2020` <int>,
# `2021` <int>, `2022` <int>, `2023` <int>, `2024` <int>
Layering in the year gives an interesting view, because it gives us the opportunity to see major increases in certain locations over time. However, there are a lot of zip Codes and a lot of years in this dataset. A visualization may help with this, my first thought being a chloropeth plot with a slider for the year value. Other quicker solutions might be looking at the individual years as observations of sorts, then looking at summary statistics. More to come here.
d1 %>%
mutate(years = year(Occurred.Date)
#years = as.character(years)
) %>%
#filter(years == '2023' | years == '2022') %>%
#group_by(Family.Violence, Highest.Offense.Description, years) %>%
group_by(Highest.Offense.Description, years) %>%
summarize(cnt = n(), .groups = 'drop') %>%
#group_by(Highest.Offense.Description, years) %>%
#mutate(tot = sum(cnt)) %>%
#relocate(tot, .after = years) %>%
mutate(Highest.Offense.Description = as.factor(Highest.Offense.Description)) %>%
as.data.frame() %>%
arrange(years) %>%
pivot_wider(names_from = years, values_from = cnt) # A tibble: 436 × 23
Highest.Offense.Description `2003` `2004` `2005` `2006` `2007` `2008` `2009`
<fct> <int> <int> <int> <int> <int> <int> <int>
1 ABUSE OF OFFICIAL CAPACITY 1 5 2 2 4 2 1
2 AGG ASLT W/MOTOR VEH FAM/DA… 25 41 37 55 52 54 50
3 AGG ASSAULT 718 809 805 810 849 971 885
4 AGG ASSAULT FAM/DATE VIOLEN… 426 446 497 564 610 628 605
5 AGG ASSAULT ON PUBLIC SERVA… 26 25 16 25 14 11 24
6 AGG ASSAULT WITH MOTOR VEH 119 147 104 88 109 121 102
7 AGG FORCED SODOMY 5 2 2 2 3 3 3
8 AGG FORCED SODOMY OF CHILD 36 34 24 23 11 15 1
9 AGG KIDNAPPING 7 6 7 6 8 9 9
10 AGG PERJURY 1 NA NA NA NA NA NA
# ℹ 426 more rows
# ℹ 15 more variables: `2010` <int>, `2011` <int>, `2012` <int>, `2013` <int>,
# `2014` <int>, `2015` <int>, `2016` <int>, `2017` <int>, `2018` <int>,
# `2019` <int>, `2020` <int>, `2021` <int>, `2022` <int>, `2023` <int>,
# `2024` <int>
Similar to our zip code variable here, but there are far too many crime descriptions to use. Grouping some together may work, I don’t think we have enough information in the dataset overall to do any sort of classification modeling however, so it may amount to a rules based classification using regular expressions. Other variables may still help with this, such as the UCF classification variables.
d1 %>%
mutate(
Category.Description = if_else(
Category.Description == '', 'NA/Unknown', Category.Description
)
) %>%
group_by(Category.Description, Highest.Offense.Description) %>%
summarize(
cnt = n()
) %>%
pivot_wider(
names_from = Category.Description, values_from = cnt
)`summarise()` has grouped output by 'Category.Description'. You can override
using the `.groups` argument.
# A tibble: 436 × 9
Highest.Offense.Description `Aggravated Assault` `Auto Theft` Burglary Murder
<chr> <int> <int> <int> <int>
1 AGG ASLT ENHANC STRANGL/SU… 1098 NA NA NA
2 AGG ASLT STRANGLE/SUFFOCATE 7712 NA NA NA
3 AGG ASLT W/MOTOR VEH FAM/D… 772 NA NA NA
4 AGG ASSAULT 18634 NA NA NA
5 AGG ASSAULT BY PUBLIC SERV… 10 NA NA NA
6 AGG ASSAULT FAM/DATE VIOLE… 9603 NA NA NA
7 AGG ASSAULT ON PEACE OFFIC… 70 NA NA NA
8 AGG ASSAULT ON PUBLIC SERV… 329 NA NA NA
9 AGG ASSAULT WITH MOTOR VEH 1823 NA NA NA
10 ARSON WITH BODILY INJURY 15 NA NA NA
# ℹ 426 more rows
# ℹ 4 more variables: `NA/Unknown` <int>, Rape <int>, Robbery <int>,
# Theft <int>
The UCF classifiers are much more consumable, but there are more incidents that are not classified than are. That said, this might be a good start for a rules based classification. Many of the unclassified incidents would not fit very nicely into the groups anyway, so some new categories may need to be made. I may or may not come back to the re-classification part, but ultimately the genesis of this project was with reports on the amount of homicide in the city, so the already existing descriptions may be enough to drill down into just those.
reminder to myself to save figures from this file to bring into manuscript and presentation later.
Data exploration through figures
Time is a variable that is much easier to interpret with a plot, so let’s start there. Time series first.
d1 %>% group_by(Occurred.Date) %>%
summarize(
cnt = n()
) %>% as.data.frame() %>%
ggplot(mapping = aes(x=Occurred.Date, y=cnt)) +
geom_line()
I expected aggregating by day to actually look worse than this. There is a clear trend, upwards at first but then starts decreasing around 2008 or so. Seeing this I am tempted to looking into time series modeling. There may be a seasonal pattern in some of those spikes, but that may be something that can be resolved with different methods. I’m tempted to go ARIMA, because frankly that’s the one I’m familiar with, but I’ll have to do some reading up on my options. An interesting idea might be to fit the model, make predictions, and then see where I landed compared to the actual data before the class ends.
d1 %>% filter(Occurred.Date <= '2023-12-31') %>%
mutate(Occurred.Date = trunc.Date(Occurred.Date, 'months')) %>%
group_by(Occurred.Date) %>%
summarize(
cnt = n()
) %>% as.data.frame() %>%
ggplot(mapping = aes(x=Occurred.Date, y=cnt)) +
geom_line()
Monthly looks even better, again further lending itself to the idea of a time series model. The downward spikes are more prevalent here and may cause problems themselves as well, but again the seasonality may be able to be resolved with things like logs or differencing. As a note, in this and the last plot I filter out 2024, mostly due to a large drop at the end from the incomplete month, but also for the aforementioned idea of comparing predictions to actual results.
A potentially quick way to identify “peak” months for crime would be via a bar chart aggregating all occurrences in each month together. I was hoping to see some quick trends here, but I see surprisingly little here. The highest month is May, potentially because of school letting out/ graduations for UT Austin, but I wouldnt say the difference is large enough compared to other months to really point to any one reason. This may be another one to layer in with time, like a chart with a slider for the year to see shifts in crime frequency by month over time.
d2 <- d1 %>%
mutate(
Occurred.Month = month(Occurred.Date, label = TRUE),
Occurred.Day = weekdays(Occurred.Date),
Occurred.Year = year(Occurred.Date),
Occur.Report.Diff = Report.Date - Occurred.Date
)
d2 %>%
#filter(Occurred.Year == 2023) %>%
ggplot(aes(x=Occurred.Month)) +
geom_bar()
Now a Zip Code bar chart. Really a bit of a graphical representation of what we saw earlier, though I filtered it to the two most recent complete years. Again, looking for big swings, and simultaneously taking a quick peak at the Family Violence indicator. Surprisingly low volume there, which leads me to believe it just doesn’t get captured appropriately every time, especially when comparing with the descriptions themselves, which we will see in a moment. Going back to the zip codes, there might be an opportunity to do something like binning the zip codes with high medium and low crime areas, or something of the sort.
d1 %>%
mutate(years = year(Occurred.Date),
years = as.character(years)
) %>%
filter(years == '2003' | years == '2022') %>%
group_by(Family.Violence, Zip.Code, years) %>%
summarize(cnt = n(), .groups = 'drop') %>%
group_by(Zip.Code, years) %>%
mutate(tot = sum(cnt)) %>%
relocate(tot, .after = years) %>%
mutate(Zip.Code = as.character(Zip.Code)) %>%
as.data.frame() %>%
arrange(desc(tot)) %>% head(100) %>%
ggplot(aes(fill=Family.Violence, x= reorder(Zip.Code,cnt), y=cnt)) +
geom_bar(position = 'stack', stat='identity') +
facet_wrap(~years) +
coord_flip()
Now a bar chart by crime description. This is similar to the last plot, but a little more disparate here. There are so many more crime descriptions than zip codes, so they are much more easily spread thin. That said, that actually makes the top five crime descriptions stand out a little more. So going back to the idea of creating new classifications, that may be a good idea to make sure those 5 are appropriately captured. And again, the family violence indicator is captured almost entirely by one description, yet the top description is Family disturbance. An easy explanation may be because the description is of the Highest Offense in the incident, so when there is physical family violence, it is often the highest offense.
d1 %>%
mutate(years = year(Occurred.Date),
years = as.character(years)
) %>%
filter(years == '2023' | years == '2022') %>%
group_by(Family.Violence, Highest.Offense.Description, years) %>%
summarize(cnt = n(), .groups = 'drop') %>%
group_by(Highest.Offense.Description, years) %>%
mutate(tot = sum(cnt)) %>%
relocate(tot, .after = years) %>%
mutate(Highest.Offense.Description = as.factor(Highest.Offense.Description)) %>%
as.data.frame() %>%
arrange(desc(tot)) %>% head(50) %>%
ggplot(aes(fill=Family.Violence, x= reorder(Highest.Offense.Description,cnt), y=cnt)) +
geom_bar(position = 'stack', stat='identity') +
facet_wrap(~years) +
coord_flip() 
Notes
Including this sandbox chunk where I can experiment with code as needed. ARIMA is not going well, mostly due to seasonal nature of data. Filtering the time range might help, or isolating other variables to do things like predict certain types of crime instead.
tig_zips <- zctas(cb=TRUE, starts_with = c(unique(d1$Zip.Code)), year = 2020)ZCTAs can take several minutes to download. To cache the data and avoid re-downloading in future R sessions, set `options(tigris_use_cache = TRUE)`
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
yearlyCrime <- d1 %>% mutate(
Zip.Char = as.character(Zip.Code),
years = year(Occurred.Date),
years = as.character(years)
) %>% filter(years != 2024) %>%
group_by(Zip.Char, years) %>%
summarize(crim.count = n()) %>%
ungroup() %>%
as.data.frame()`summarise()` has grouped output by 'Zip.Char'. You can override using the
`.groups` argument.
#filter(count >= 500) %>%
#filter(Zip.Char == '78741') %>%
allYrsZips <- expand.grid(years = unique(yearlyCrime$years), Zip.Char = unique(yearlyCrime$Zip.Char))
yearlyCrimeAll <- left_join(allYrsZips, yearlyCrime, by = join_by(Zip.Char, years)) %>%
inner_join(tig_zips, by = join_by(Zip.Char == ZCTA5CE20)) %>%
mutate(crim.count = ifelse(is.na(crim.count), 0, crim.count))cpeth2 <- yearlyCrimeAll %>% #filter(crim.count >= 10) %>%
ggplot() +
geom_sf(aes(geometry = geometry, fill = 0)) +
geom_sf(aes(geometry = geometry, fill = -crim.count, frame = years, label = Zip.Char)) +
#scale_fill_gradient(trans = 'reverse')
scale_fill_viridis_c(limits = c(-13000, 0), oob = scales::squish, direction = -1)Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
: Ignoring unknown aesthetics: frame and label
cpeth2
cplotly2 <- cpeth2 %>% ggplotly(originalData=FALSE) %>% animation_opts(1500, 1)
cplotly2